#' ---
#' title: "Campaign Communication and Legislative Leadership (PSRM)"
#' subtitle: "04_clean_transformers_output.R"
#' author: "Authors: Stefan Mueller and Naofumi Fujimura"
#' date: "Note: Code compiled successfully on `r format(Sys.time(), '%d %B %Y')`"
#' ---

# load packages
library(dplyr)   # CRAN v1.1.2
library(scales)  # CRAN v1.2.1
library(ggplot2) # CRAN v3.4.2
library(readr)   # CRAN v2.1.4
library(xtable)  # CRAN v1.8-4
library(stringr) # CRAN v1.5.0


# If the code does not run, one or more packages may have been 
# updated, which may result in errors or conflicts. You can solve this issue
# by installing the package version listed above or by using the 
# groundhog package:
# after installing groundhog using install.packages("groundhog")
# change library(name_of_package) to
# groundhog::groundhog.library(name_of_package, date = "2024-01-31")
# Instead of adjusting the library() function for each package, 
# you can adjust them all at once using the
# following syntax:
# groundhog.library("library('pkgA')
#                   library('pkgB')
#                   library('pkgC')", date = "2024-01-31")
# More details are available at: https://groundhogr.com/using/

# record the R version and the attached packages
sessionInfo()


# custom ggplot2 scheme defined in function_theme_base.R
source("function_theme_base.R")

# data with the BERT-predicted class for each row
dat_predicted <- readr::read_csv(file = "data_predicted_full.csv")



# quick look at the available columns and the number of rows
colnames(dat_predicted)
nrow(dat_predicted)

# the set of 3,000 hand-coded statements
dat <- read.csv(
    file = "data_handcoded_sentences.csv",
    fileEncoding = "utf-8",
    stringsAsFactors = FALSE
)


# Keep only statements that were assigned a policy area and attach a
# readable label for the type of policy content. Every type other than the
# four listed below, as well as missing types, is labelled
# "Policy content but no pledge".
# Table A5 and the example statements both go through this helper so that
# they always rely on the same recoding.
label_policy_content <- function(statements) {
    statements |>
        filter(policy_area != "No policy area") |> # focus only on policy statements
        mutate(`Type of policy content` = dplyr::recode(
            type,
            "clarification" = "Clarification and details",
            "credit_claiming" = "Credit claiming",
            "former_jobs" = "Former jobs and personal background",
            "pledge" = "Pledge",
            .default = "Policy content but no pledge",
            .missing = "Policy content but no pledge"
        ))
}

# number and percentage of statements by type of policy content,
# most frequent type first
dat_policy <- dat |>
    label_policy_content() |>
    group_by(`Type of policy content`) |>
    count(sort = TRUE, name = "Number of statements") |>
    ungroup() |>
    mutate(Percent = paste0(
        round(100 * `Number of statements` / sum(`Number of statements`), 1),
        "%"
    ))


dat_policy

# Save as Table A5
print(xtable(dat_policy, digits = 0),
      file = "tab_a05.html",
      type = "html", include.rownames = FALSE)

# Get five randomly drawn examples for each category.
# sample_n() is kept (rather than switching to slice_sample()) so that the
# draws obtained under this seed are not altered.
set.seed(235)
dat_examples <- dat |>
    label_policy_content() |>
    group_by(`Type of policy content`) |>
    sample_n(size = 5) |>
    ungroup() |> # grouping is only needed for the draw itself
    dplyr::select(`Type of policy content`, sentence, sentence_id)

dat_examples


# number of hand-coded statements
nrow(dat)

# Count the hand-coded statements per policy area, turn the counts into
# proportions of the whole sample, and harmonise the policy-area labels
dat_predicted_count_annotated <- dat |>
    group_by(policy_area) |>
    summarise(n = n(), .groups = "drop") |>
    mutate(
        prop = n / sum(n),
        policy_area = dplyr::recode(policy_area,
                                    "No policy area" = "No/Other Policy Area"),
        policy_area = str_replace_all(policy_area, "Labour", "Labor")
    )


# print the sum of the proportions as a check
sum(dat_predicted_count_annotated$prop)


# Create Figure A6: horizontal bars with the proportion of hand-coded
# statements per policy area, ordered by proportion and annotated with the
# percentage and the number of statements
fig_a06 <- ggplot(
    data = dat_predicted_count_annotated,
    mapping = aes(x = prop, y = reorder(policy_area, prop))
) +
    geom_col() +
    geom_text(
        aes(label = paste0(
            sprintf("%.1f%%; n=", round(100 * prop, 1)),
            str_trim(format(n, big.mark = ","))
        )),
        nudge_x = 0.005, hjust = 0,
        colour = "grey50"
    ) +
    scale_y_discrete(labels = scales::label_wrap(40)) +
    scale_x_continuous(
        labels = scales::percent_format(accuracy = 1),
        # upper limit exceeds the last break to leave room for the text labels
        limits = c(0, 0.55),
        breaks = seq(0, 0.5, by = 0.1)
    ) +
    labs(x = "Percentage of Hand-Coded Statements", y = NULL)

# display the figure, then write it to disk
fig_a06
ggsave("fig_a06.pdf",
       plot = fig_a06,
       width = 9, height = 6)

# labels for the class codes stored in `label`
bert_class_labels <- c(
    "0" = "Agriculture, Forestry, and Fisheries",
    "1" = "Committees on Cabinet",
    "2" = "Economy, Trade and Industry",
    "3" = "Education, Culture, Sports, Science, and Technology",
    "4" = "Environment",
    "5" = "Financial Affairs",
    "6" = "Foreign Affairs",
    "7" = "Health, Labor, and Welfare",
    "8" = "Internal Affairs and Communications",
    "9" = "Land, Infrastructure, Transport, and Tourism",
    "10" = "No policy area",
    "11" = "Security"
)

# rename the document identifier and translate the predicted class code
# into its policy-area label
dat_predicted <- dat_predicted |>
    rename(manifesto_id = doc_id) |>
    mutate(predicted_class_bert = dplyr::recode(label, !!!bert_class_labels))

# Aggregate to manifesto level: number of sentences per predicted class and
# the share of each class among all sentences of the manifesto
dat_pred_manifesto_bert <- dat_predicted |>
    group_by(manifesto_id) |>
    mutate(n_sentences_manifesto = n()) |>
    ungroup() |>
    group_by(manifesto_id, n_sentences_manifesto, predicted_class_bert) |>
    # drop only the class level: the result stays grouped by manifesto, which
    # is what the denominator of the share below relies on
    summarise(n_sentences_class_bert = n(), .groups = "drop_last") |>
    mutate(prop_policyarea_bert = n_sentences_class_bert / sum(n_sentences_class_bert))


# Repeat without "No policy area": shares are now relative to the sentences
# of a manifesto that were assigned to a policy area
dat_pred_manifesto_bert_only_policies <- dat_predicted |>
    filter(predicted_class_bert != "No policy area") |>
    group_by(manifesto_id, predicted_class_bert) |>
    # result stays grouped by manifesto_id for the per-manifesto totals
    summarise(n_sentences_class_policyareas_bert = n(), .groups = "drop_last") |>
    mutate(
        n_sentences_manifesto_policyareas_bert = sum(n_sentences_class_policyareas_bert),
        prop_policyarea_relevant_bert = n_sentences_class_policyareas_bert /
            n_sentences_manifesto_policyareas_bert
    )


# Combine both tables. Rows of the first table without a counterpart in the
# policy-only table (e.g. "No policy area") get NA in the policy-only columns.
dat_pred_merged <- full_join(dat_pred_manifesto_bert,
                             dat_pred_manifesto_bert_only_policies,
                             by = c("manifesto_id", "predicted_class_bert"))


# copy of the predicted class under a second column name
dat_pred_merged$predicted_class <- dat_pred_merged$predicted_class_bert


# ids that are assigned to 2012 by hand
manifestos_2012 <- c("YamamotoTomohiro", "ToyodaMayuko",
                     "SugaYoshihide", "KoizumiShinjiro")

# Derive the year from the manifesto id.
# NOTE(review): `year` is added to the policy-only table after
# `dat_pred_merged` was created, so it is not part of the RDS file saved
# below -- confirm this is intended.
dat_pred_manifesto_bert_only_policies <- dat_pred_manifesto_bert_only_policies |>
    mutate(
        # number contained in the id; parsing warnings are silenced
        year = suppressWarnings(readr::parse_number(manifesto_id)),
        # manual assignments for selected ids
        year = ifelse(manifesto_id == "Tanigawayaichi", 5, year),
        year = ifelse(manifesto_id %in% manifestos_2012, 12, year),
        # recode year based on filename: short codes become four-digit years
        year = dplyr::recode(as.character(year),
                             "3" = "2003",
                             "5" = "2005",
                             "9" = "2009",
                             "12" = "2012",
                             "14" = "2014")
    )

# distribution of rows across years
table(dat_pred_manifesto_bert_only_policies$year)

# save as RDS file for further analyses
saveRDS(dat_pred_merged, file = "data_policy_areas_manifesto_bert.rds")
